Visualize distribution of amino-acid variants

In [1]:
# Imports
import pandas as pd
import altair as alt

# Plotting colors
# re-arranged for plot
# Categorical palette of 13 hex colors, ending with black and a light grey.
# NOTE(review): the name suggests an adjusted version of Paul Tol's "muted"
# scheme — confirm the provenance of each entry.
# NOTE(review): not referenced by any cell visible in this part of the
# notebook; check for other users before removing.
tol_muted_adjusted = [
    "#AA4499",
    "#88CCEE",
    "#EE7733",
    "#44AA99",
    "#1f78b4",
    "#CC6677",
    "#117733",
    "#999933",
    "#DDCC77",
    "#CC3311",
    "#882255",
    "#000000",
    "#DDDDDD",
]

# Allow more rows for Altair
# The full variant table is handed straight to `alt.Chart` below, so Altair's
# row limit is lifted here. Assigning the result to `_` keeps the call's
# return value from being echoed as the cell output.
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
# `variant_data` is the path of the CSV that is later read with `pd.read_csv`;
# the `None` default is a placeholder and must be overridden before loading.
variant_data = None
In [3]:
# Parameters
# NOTE(review): presumably injected by papermill at execution time, overriding
# the `None` default from the tagged `parameters` cell — confirm.
variant_data = "results/variants/codon_variants.csv"
In [4]:
# # Uncomment to run interactively
# variant_data = "../results/variants/codon_variants.csv"
In [5]:
# Load data
variant_df = pd.read_csv(variant_data)

# Group all variants with >= 8 amino-acid mutations into the final bin.
# `clip` caps the whole column in one vectorized call (no per-row lambda), and
# the same constant drives the x-axis tick values so the two cannot drift apart.
MAX_AA_SUBSTITUTIONS = 8
variant_df["n_aa_substitutions"] = variant_df["n_aa_substitutions"].clip(
    upper=MAX_AA_SUBSTITUTIONS
)

# Axis styling shared by the x and y axes
shared_axis_style = dict(
    domainWidth=1,
    domainColor="black",
    tickColor="black",
    labelFontSize=8,
    labelFontWeight="normal",
    titleFontWeight="normal",
)

# Bar chart of the number of variants per (capped) amino-acid mutation count,
# one facet per library.
distribution_plot = alt.Chart(variant_df).mark_bar(color="#000000", size=7).encode(
    x=alt.X(
        "n_aa_substitutions",
        axis=alt.Axis(
            title="AA muts",
            values=list(range(MAX_AA_SUBSTITUTIONS + 1)),
            **shared_axis_style,
        ),
    ),
    y=alt.Y(
        "count()",
        axis=alt.Axis(
            title="number of variants",
            values=[0, 5000, 10000, 15000],
            **shared_axis_style,
        ),
        # Fixed domain so every facet shares the same y range
        scale=alt.Scale(domain=[0, 15000]),
    ),
    facet=alt.Facet(
        "library",
        title=None,
        columns=2,
        header=alt.Header(
            labelFontSize=8,
            labelFontWeight="bold",
        ),
    ),
).properties(
    width=75,
    height=100,
).configure_axis(
    grid=False,
    labelFontSize=8,
    titleFontSize=8,
    labelFontWeight="normal",
)

distribution_plot
Out[5]:

Calculate the number of distinct amino-acid mutations measured in each library, both among single-mutant barcoded variants and across all barcoded variants.

In [6]:
# Denominator for the "% possible mutations" figures printed below.
# NOTE(review): the origin of 9820 is not shown in this notebook — presumably
# the total number of possible amino-acid mutations; TODO confirm.
N_POSSIBLE_MUTATIONS = 9820


def count_single_mutant_mutations(df, library):
    """Count distinct mutations carried by single-mutant variants of a library.

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table with `library`, `n_aa_substitutions` and
        `aa_substitutions` columns.
    library : str
        Value of the `library` column to select (e.g. "LibA").

    Returns
    -------
    int
        Number of unique `aa_substitutions` values among rows of `library`
        with exactly one amino-acid substitution.
    """
    is_single = (df["library"] == library) & (df["n_aa_substitutions"] == 1)
    return df.loc[is_single, "aa_substitutions"].nunique()


def count_all_mutations(df, library):
    """Count distinct mutations carried by any mutated variant of a library.

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table with `library`, `n_aa_substitutions` and
        `aa_substitutions` columns.
    library : str
        Value of the `library` column to select (e.g. "LibA").

    Returns
    -------
    int
        Number of unique space-separated tokens found in `aa_substitutions`
        among rows of `library` with at least one amino-acid substitution.
    """
    is_mutant = (df["library"] == library) & (df["n_aa_substitutions"] >= 1)
    return (
        df.loc[is_mutant, "aa_substitutions"]
        # One list of mutations per variant, then one row per mutation
        .str.split(" ")
        .explode()
        .nunique()
    )


def report_mutation_count(variant_set, library_letter, n_mutations):
    """Print a mutation count and its percentage of `N_POSSIBLE_MUTATIONS`."""
    print(
        f"Number of mutations measured in {variant_set} in library {library_letter}: "
        f"{n_mutations} ({n_mutations/N_POSSIBLE_MUTATIONS * 100:.1f} % possible mutations)"
    )


# Single mutant variant counts
libA_single_mutants = count_single_mutant_mutations(variant_df, "LibA")
report_mutation_count("single mutant variants", "A", libA_single_mutants)

libB_single_mutants = count_single_mutant_mutations(variant_df, "LibB")
report_mutation_count("single mutant variants", "B", libB_single_mutants)

# All mutant variant counts
libA_all_mutants = count_all_mutations(variant_df, "LibA")
report_mutation_count("all barcoded variants", "A", libA_all_mutants)

libB_all_mutants = count_all_mutations(variant_df, "LibB")
report_mutation_count("all barcoded variants", "B", libB_all_mutants)
Number of mutations measured in single mutant variants in library A: 5737 (58.4 % possible mutations)
Number of mutations measured in single mutant variants in library B: 6458 (65.8 % possible mutations)
Number of mutations measured in all barcoded variants in library A: 9725 (99.0 % possible mutations)
Number of mutations measured in all barcoded variants in library B: 9789 (99.7 % possible mutations)